{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import xgboost as xgb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_digits"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.cross_validation import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "digits = load_digits(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X = digits['data']\n",
    "y = digits['target']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "clf1 = xgb.XGBClassifier()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\tvalidation_0-auc:0.999497\n",
      "Will train until validation_0-auc hasn't improved in 5 rounds.\n",
      "[1]\tvalidation_0-auc:0.999497\n",
      "[2]\tvalidation_0-auc:0.999497\n",
      "[3]\tvalidation_0-auc:0.999749\n",
      "[4]\tvalidation_0-auc:0.999749\n",
      "[5]\tvalidation_0-auc:0.999749\n",
      "[6]\tvalidation_0-auc:0.999749\n",
      "[7]\tvalidation_0-auc:0.999749\n",
      "[8]\tvalidation_0-auc:0.999749\n",
      "Stopping. Best iteration:\n",
      "[3]\tvalidation_0-auc:0.999749\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,\n",
       "       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,\n",
       "       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,\n",
       "       objective='binary:logistic', reg_alpha=0, reg_lambda=1,\n",
       "       scale_pos_weight=1, seed=0, silent=True, subsample=1)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf1.fit(X_train, y_train, early_stopping_rounds=5, eval_metric='auc',\n",
    "         eval_set=[(X_test, y_test)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.999749"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf1.best_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\tvalidation_0-auc:0.999497\n",
      "Will train until validation_0-auc hasn't improved in 4 rounds.\n",
      "[1]\tvalidation_0-auc:0.999497\n",
      "[2]\tvalidation_0-auc:0.999497\n",
      "[3]\tvalidation_0-auc:0.999749\n",
      "[4]\tvalidation_0-auc:0.999749\n",
      "[5]\tvalidation_0-auc:0.999749\n",
      "[6]\tvalidation_0-auc:0.999749\n",
      "[7]\tvalidation_0-auc:0.999749\n",
      "Stopping. Best iteration:\n",
      "[3]\tvalidation_0-auc:0.999749\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,\n",
       "       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,\n",
       "       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,\n",
       "       objective='binary:logistic', reg_alpha=0, reg_lambda=1,\n",
       "       scale_pos_weight=1, seed=0, silent=True, subsample=1)"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf2 = xgb.XGBClassifier()\n",
    "clf2.fit(X_train, y_train, early_stopping_rounds=4, eval_metric=\"auc\",\n",
    "         eval_set=[(X_test, y_test)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.999749"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf2.best_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0]\tvalidation_0-auc:0.999497\n",
      "Will train until validation_0-auc hasn't improved in 10 rounds.\n",
      "[1]\tvalidation_0-auc:0.999497\n",
      "[2]\tvalidation_0-auc:0.999497\n",
      "[3]\tvalidation_0-auc:0.999749\n",
      "[4]\tvalidation_0-auc:0.999749\n",
      "[5]\tvalidation_0-auc:0.999749\n",
      "[6]\tvalidation_0-auc:0.999749\n",
      "[7]\tvalidation_0-auc:0.999749\n",
      "[8]\tvalidation_0-auc:0.999749\n",
      "[9]\tvalidation_0-auc:0.999749\n",
      "[10]\tvalidation_0-auc:1\n",
      "[11]\tvalidation_0-auc:1\n",
      "[12]\tvalidation_0-auc:1\n",
      "[13]\tvalidation_0-auc:1\n",
      "[14]\tvalidation_0-auc:1\n",
      "[15]\tvalidation_0-auc:1\n",
      "[16]\tvalidation_0-auc:1\n",
      "[17]\tvalidation_0-auc:1\n",
      "[18]\tvalidation_0-auc:1\n",
      "[19]\tvalidation_0-auc:1\n",
      "[20]\tvalidation_0-auc:1\n",
      "Stopping. Best iteration:\n",
      "[10]\tvalidation_0-auc:1\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,\n",
       "       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,\n",
       "       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,\n",
       "       objective='binary:logistic', reg_alpha=0, reg_lambda=1,\n",
       "       scale_pos_weight=1, seed=0, silent=True, subsample=1)"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 是否过拟合\n",
    "clf3 = xgb.XGBClassifier()\n",
    "clf3.fit(X_train, y_train, early_stopping_rounds=10, eval_metric=\"auc\",\n",
    "         eval_set=[(X_test, y_test)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1.0"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf3.best_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 交叉验证和early stopping结合起来"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 使用xgboost提供的cv功能，而不是sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "dm = xgb.DMatrix(X, label=y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, early_stopping_rounds=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "10"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, early_stopping_rounds=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def evalerror(preds, dtrain):\n",
    "    from sklearn.metrics import mean_squared_error\n",
    "    labels = dtrain.get_label()\n",
    "    return 'rmse', mean_squared_error(labels, preds)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 使用自定义的评价指标\n",
    "cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, feval=evalerror, maximize=True, early_stopping_rounds=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cv.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
